Midterm Code

Importing the Data

# Import the raw diamonds data and encode the three quality ratings as
# ordered factors (worst -> best) so downstream models treat them as ordinal.
# Fixes: use <- for assignment, TRUE instead of T, the real argument name
# `ordered` (the original `order=` only worked via partial matching), and
# drop the redundant data.frame() wrapper (read.csv already returns one).
diamonds <- read.csv("diamonds.csv")
diamonds$color <- factor(diamonds$color, ordered = TRUE,
                         levels = c('J', 'I', 'H', 'G', 'F', 'E', 'D'))
diamonds$clarity <- factor(diamonds$clarity, ordered = TRUE,
                           levels = c('I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'))
diamonds$cut <- factor(diamonds$cut, ordered = TRUE,
                       levels = c('Fair', 'Good', 'Very Good', 'Premium', 'Ideal'))
colnames(diamonds)[1] <- "ID"  # first CSV column is the row identifier

EDA Code

summary table

# Summary table of every variable. The original assigned this to `sum`,
# which masks base::sum in the workspace — renamed to avoid the shadowing.
diamonds_summary <- xkablesummary(diamonds, title = "Summary of Diamonds")
diamonds_summary
Summary of Diamonds
ID carat cut color clarity depth table price x y z
Min Min. : 1 Min. :0.20 Fair : 1610 J: 2808 SI1 :13065 Min. :43.0 Min. :43.0 Min. : 326 Min. : 0.00 Min. : 0.0 Min. : 0.0
Q1 1st Qu.:13486 1st Qu.:0.40 Good : 4906 I: 5422 VS2 :12258 1st Qu.:61.0 1st Qu.:56.0 1st Qu.: 950 1st Qu.: 4.71 1st Qu.: 4.7 1st Qu.: 2.9
Median Median :26970 Median :0.70 Very Good:12082 H: 8304 SI2 : 9194 Median :61.8 Median :57.0 Median : 2401 Median : 5.70 Median : 5.7 Median : 3.5
Mean Mean :26970 Mean :0.80 Premium :13791 G:11292 VS1 : 8171 Mean :61.7 Mean :57.5 Mean : 3933 Mean : 5.73 Mean : 5.7 Mean : 3.5
Q3 3rd Qu.:40455 3rd Qu.:1.04 Ideal :21551 F: 9542 VVS2 : 5066 3rd Qu.:62.5 3rd Qu.:59.0 3rd Qu.: 5324 3rd Qu.: 6.54 3rd Qu.: 6.5 3rd Qu.: 4.0
Max Max. :53940 Max. :5.01 NA E: 9797 VVS1 : 3655 Max. :79.0 Max. :95.0 Max. :18823 Max. :10.74 Max. :58.9 Max. :31.8
NA NA NA NA D: 6775 (Other): 2531 NA NA NA NA NA NA

Categorical Data Exploration

# Frequency of each color grade (J = worst ... D = best)
color_counts <- table(diamonds$color)
barplot(color_counts,
        col = c(2, 3, 4, 7, 6, 8, 9),
        main = 'Barplot of Color',
        xlab = "Color", ylab = "Frequency")

color plot

# Frequency of each clarity grade (I1 = worst ... IF = best)
clarity_counts <- table(diamonds$clarity)
barplot(clarity_counts,
        col = c(2, 3, 4, 7, 6, 8, 9, 5),
        main = 'Barplot of Clarity',
        xlab = "Clarity", ylab = "Frequency")

clarity plot

# Frequency of each cut grade
barplot(table(diamonds$cut),
        col = c(6, 7, 8, 3, 4),
        main = 'Barplot of Cut',
        xlab = "Cut", ylab = "Frequency")

# Per-grade counts, converted in place to proportions of the whole
cut_count <- dplyr::count(diamonds, cut)
colnames(cut_count)[colnames(cut_count) == 'cut'] <- 'Cut'
cut_count$n <- cut_count$n / sum(cut_count$n)

loadPkg("ggplot2")

# Pie chart = stacked bar on a polar y-axis, labelled with the proportions
ggplot(cut_count, aes(x = "", y = n, fill = Cut)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  geom_text(size = 4,
            aes(label = paste0(round(n, 2), "")),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c(6, 7, 8, 3, 4)) +
  labs(title = 'Pie Chart of Cut')

cut plot

Continuous Data Exploration

In this section we will look at all the continuous variables besides price.

# Scatter of the physical dimensions, dropping impossible rows (a zero
# dimension) and the handful of extreme y/z values above 30.
diaNO <- subset(diamonds, x > 0 & y > 0 & z > 0 & y < 30 & z < 30)
ggplot(diaNO, aes(x, y, color = z)) +
  geom_point(size = 8, alpha = 0.6) +
  ggtitle('Y versus X versus Z for Diamonds (Outliers Removed)') +
  scale_color_gradient2(midpoint = 5, low = "blue", mid = "green", high = "red")

x vs y vs z

# Horizontal histogram of the table measurement, 1.5-unit bins
ggplot(diamonds, aes(y = table)) +
  geom_histogram(col = "black",
                 fill = "green",
                 alpha = .8,
                 binwidth = 1.5) +
  labs(title = 'Histogram of Table') +
  coord_flip() +
  labs(y = "Table", x = "Frequency")

table plot

# Horizontal histogram of depth, 1-unit bins
ggplot(diamonds, aes(y = depth)) +
  geom_histogram(col = "black",
                 fill = "red",
                 alpha = .8,
                 binwidth = 1) +
  coord_flip() +
  labs(title = 'Histogram of Depth') +
  labs(y = "Depth", x = "Frequency")

depth plot

# Boxplot of carat. The original stacked two geom_boxplot() layers; the
# first (default-styled) layer was redundant underneath the styled one,
# so it has been removed.
ggplot(diamonds, aes(y = carat)) +
  geom_boxplot(colour = "black", fill = "orange",
               outlier.colour = "orange", outlier.size = 5, alpha = 0.8) +
  labs(title = 'Boxplot of Carat') +
  labs(y = "Carat")

carat plot

Price Exploration

# Boxplot of price. As with the carat boxplot, the redundant default
# geom_boxplot() layer beneath the styled one has been removed.
ggplot(diamonds, aes(y = price)) +
  geom_boxplot(colour = "black", fill = "blue",
               outlier.colour = "blue", outlier.size = 5, alpha = 0.8) +
  labs(title = 'Boxplot of Price')

# Horizontal histogram of price in $300 bins
ggplot(diamonds, aes(y = price)) +
  geom_histogram(col = "black",
                 fill = "blue",
                 alpha = .8,
                 binwidth = 300) +
  coord_flip() +
  labs(title = 'Histogram of Price') +
  labs(y = "Price", x = "Frequency")

# Normality check on price; the histogram above shows heavy right skew,
# so the points are expected to bend away from the reference line.
qqnorm(diamonds$price, main = "Q-Q plot of Price", col = 'blue', ylab = 'Price')
qqline(diamonds$price)

price plot

Linear Regression to predict price

single variables

one model for each individual

# Fit one simple linear regression of price on each numeric predictor so
# their individual explanatory power (R^2) can be compared side by side.
modelcarat <- lm(price ~ carat,data=diamonds)
modeldepth <- lm(price ~ depth,data=diamonds)
modeltable <- lm(price ~ table,data=diamonds)
modelx <- lm(price ~ x,data=diamonds)
modely <- lm(price ~ y,data=diamonds)
modelz <- lm(price ~ z,data=diamonds)
# sjPlot / sjmisc / sjlabelled together provide tab_model() regression tables
loadPkg("sjPlot")
loadPkg("sjmisc")
loadPkg("sjlabelled")
tab_model(modelcarat)
  price
Predictors Estimates CI p
(Intercept) -2256.36 -2281.95 – -2230.77 <0.001
carat 7756.43 7728.86 – 7784.00 <0.001
Observations 53940
R2 / R2 adjusted 0.849 / 0.849
tab_model(modeldepth)
  price
Predictors Estimates CI p
(Intercept) 5763.67 4312.17 – 7215.16 <0.001
depth -29.65 -53.15 – -6.15 0.013
Observations 53940
R2 / R2 adjusted 0.000 / 0.000
tab_model(modeltable)
  price
Predictors Estimates CI p
(Intercept) -9109.05 -9968.41 – -8249.68 <0.001
table 226.98 212.04 – 241.93 <0.001
Observations 53940
R2 / R2 adjusted 0.016 / 0.016
tab_model(modelx)
  price
Predictors Estimates CI p
(Intercept) -14094.06 -14175.85 – -14012.26 <0.001
x 3145.41 3131.41 – 3159.42 <0.001
Observations 53940
R2 / R2 adjusted 0.782 / 0.782
tab_model(modely)
  price
Predictors Estimates CI p
(Intercept) -13402.03 -13488.39 – -13315.66 <0.001
y 3022.89 3008.12 – 3037.66 <0.001
Observations 53940
R2 / R2 adjusted 0.749 / 0.749
tab_model(modelz)
  price
Predictors Estimates CI p
(Intercept) -13296.57 -13384.05 – -13209.08 <0.001
z 4868.79 4844.55 – 4893.04 <0.001
Observations 53940
R2 / R2 adjusted 0.742 / 0.742

ANOVA of models

# NOTE(review): anova()'s sequential F-tests are only meaningful for NESTED
# models; these four single-predictor models are not nested (each pair has
# Df = 0 in the output), so the F statistics come back NA. Only the RSS
# column is usable here, as a rough goodness-of-fit comparison.
anovaRes = anova(modelcarat,modelx,modely,modelz)
xkabledply(anovaRes, title = "ANOVA comparison between the models")
ANOVA comparison between the models
Res.Df RSS Df Sum of Sq F Pr(>F)
53938 1.29e+11 NA NA NA NA
53938 1.87e+11 0 -5.76e+10 NA NA
53938 2.16e+11 0 -2.86e+10 NA NA
53938 2.22e+11 0 -6.18e+09 NA NA
loadPkg("modelr")
# Attach the carat-only model's fitted values as a `pred` column
df.with.predfin.from.model <- add_predictions(diamonds, modelcarat)
head(df.with.predfin.from.model)
loadPkg("ggplot2")
# Actual price vs prediction; the red line traces the predictions themselves
ggplot(df.with.predfin.from.model, aes(price, pred)) +
  geom_point(aes(price, pred), alpha = 0.2) +
  geom_line(aes(pred), colour = "red", size = 1) +
  labs(title = "Carat Linear Model to Predict Price",
       x = "Price", y = "Prediction") +
  xlim(c(0, 20000))

loadPkg("car")
# Added-variable plot for the carat model
avPlots(modelcarat)

Carat is the best singular model

multinumeric variables

# Pairwise correlations among the numeric columns only
diamondsnum <- subset(diamonds,
                      select = c("price", "carat", "depth", "table", "x", "y", "z"))
diamondscor <- cor(diamondsnum)
loadPkg("corrplot")
corrplot.mixed(diamondscor)

high correlation between many variables

#loadPkg("lattice") 
#pairs(diamondsnum[1:4])

Removed the pairs plot for now because it was not used in the presentation and it takes too long to run.

# price ~ carat + x; the VIF table below (~20 for both) shows these two
# predictors are highly collinear.
modelcaratx <- lm(price ~ carat+x,data=diamonds)
tab_model(modelcaratx)
  price
Predictors Estimates CI p
(Intercept) 1737.95 1534.85 – 1941.05 <0.001
carat 10125.99 10003.38 – 10248.59 <0.001
x -1026.86 -1078.67 – -975.05 <0.001
Observations 53940
R2 / R2 adjusted 0.853 / 0.853
xkablevif(modelcaratx)
VIFs of the model
carat x
20.3 20.3
# price ~ carat + table; low collinearity (VIFs ~1) but R^2 barely improves
# over carat alone.
modelcarattable <- lm(price ~ carat+table,data=diamonds)
tab_model(modelcarattable)
  price
Predictors Estimates CI p
(Intercept) 1961.99 1625.24 – 2298.74 <0.001
carat 7820.04 7792.16 – 7847.92 <0.001
table -74.30 -80.22 – -68.39 <0.001
Observations 53940
R2 / R2 adjusted 0.851 / 0.851
xkablevif(modelcarattable)
VIFs of the model
carat table
1.03 1.03
# price ~ depth + x; depth adds essentially nothing beyond x (R^2 unchanged
# at 0.782 per the table below).
modeldepthx <- lm(price ~ depth+x,data=diamonds)
tab_model(modeldepthx)
  price
Predictors Estimates CI p
(Intercept) -16116.57 -16800.73 – -15432.42 <0.001
depth 32.66 21.69 – 43.62 <0.001
x 3146.47 3132.46 – 3160.47 <0.001
Observations 53940
R2 / R2 adjusted 0.782 / 0.782
xkablevif(modeldepthx)
VIFs of the model
depth x
1 1

Every multinumeric model is either a worse predictor, or has higher vif values

There is no way to fix this from looking online

numeric + categorical variables

trying out different models

# price ~ carat + cut; cut is an ordered factor, so lm() enters it through
# polynomial contrasts (the linear/quadratic/... rows in the table below).
modelcaratcut <- lm(price ~ carat+cut, data = diamonds)
tab_model(modelcaratcut)
  price
Predictors Estimates CI p
(Intercept) -2701.38 -2731.62 – -2671.13 <0.001
carat 7871.08 7843.68 – 7898.48 <0.001
cut [linear] 1239.80 1188.64 – 1290.96 <0.001
cut [quadratic] -528.60 -573.94 – -483.26 <0.001
cut [cubic] 367.91 328.29 – 407.53 <0.001
cut [4th degree] 74.59 42.76 – 106.42 <0.001
Observations 53940
R2 / R2 adjusted 0.856 / 0.856
xkablevif(modelcaratcut)
VIFs of the model
carat cut.C cut.L cut.Q cut^4
1.04 2 2.67 1.75 1.24
# price ~ carat + clarity (ordered factor via polynomial contrasts)
modelcaratclarity <- lm(price ~ carat+clarity, data = diamonds)
tab_model(modelcaratclarity)
  price
Predictors Estimates CI p
(Intercept) -2977.27 -3002.97 – -2951.57 <0.001
carat 8440.06 8415.26 – 8464.85 <0.001
clarity [linear] 4216.78 4150.81 – 4282.74 <0.001
clarity [quadratic] -1931.41 -1993.87 – -1868.94 <0.001
clarity [cubic] 1005.85 952.16 – 1059.54 <0.001
clarity [4th degree] -480.18 -523.19 – -437.17 <0.001
clarity [5th degree] 283.94 248.71 – 319.18 <0.001
clarity [6th degree] 12.66 -18.07 – 43.40 0.419
clarity [7th degree] 198.05 170.97 – 225.13 <0.001
Observations 53940
R2 / R2 adjusted 0.895 / 0.895
xkablevif(modelcaratclarity)
VIFs of the model
carat clarity.C clarity.L clarity.Q clarity^4 clarity^5 clarity^6 clarity^7
1.16 2.36 1.87 2.24 1.92 1.48 1.28 1.12

This model is the best

# price ~ carat + color (ordered factor via polynomial contrasts)
modelcaratcolor <- lm(price ~ carat+color, data = diamonds)
tab_model(modelcaratcolor)
  price
Predictors Estimates CI p
(Intercept) -2702.23 -2729.25 – -2675.22 <0.001
carat 8066.62 8039.11 – 8094.14 <0.001
color [linear] 1572.20 1528.46 – 1615.94 <0.001
color [quadratic] -741.14 -781.13 – -701.16 <0.001
color [cubic] 122.70 85.17 – 160.22 <0.001
color [4th degree] 78.77 44.30 – 113.23 <0.001
color [5th degree] 144.74 112.16 – 177.32 <0.001
color [6th degree] -180.75 -210.30 – -151.19 <0.001
Observations 53940
R2 / R2 adjusted 0.864 / 0.864
xkablevif(modelcaratcolor)
VIFs of the model
carat color.C color.L color.Q color^4 color^5 color^6
1.1 1.28 1.22 1.2 1.16 1.06 1.03
# Full main-effects model: carat plus all three ordered quality factors
modelcaratccc <- lm(price ~ carat+color+cut+clarity, data = diamonds)
tab_model(modelcaratccc)
  price
Predictors Estimates CI p
(Intercept) -3710.60 -3738.01 – -3683.20 <0.001
carat 8886.13 8862.54 – 8909.72 <0.001
color [linear] 1910.29 1875.57 – 1945.00 <0.001
color [quadratic] -627.95 -659.55 – -596.36 <0.001
color [cubic] 171.96 142.42 – 201.50 <0.001
color [4th degree] 21.68 -5.45 – 48.80 0.117
color [5th degree] 85.94 60.31 – 111.57 <0.001
color [6th degree] -49.99 -73.29 – -26.68 <0.001
cut [linear] 698.91 659.05 – 738.76 <0.001
cut [quadratic] -327.69 -362.79 – -292.58 <0.001
cut [cubic] 180.57 150.07 – 211.06 <0.001
cut [4th degree] -1.21 -25.63 – 23.21 0.923
clarity [linear] 4217.53 4157.11 – 4277.96 <0.001
clarity [quadratic] -1832.41 -1888.91 – -1775.90 <0.001
clarity [cubic] 923.27 874.90 – 971.64 <0.001
clarity [4th degree] -361.99 -400.68 – -323.31 <0.001
clarity [5th degree] 216.62 185.04 – 248.19 <0.001
clarity [6th degree] 2.11 -25.41 – 29.62 0.881
clarity [7th degree] 110.34 86.07 – 134.61 <0.001
Observations 53940
R2 / R2 adjusted 0.916 / 0.916
xkablevif(modelcaratccc)
VIFs of the model
carat clarity.C clarity.L clarity.Q clarity^4 clarity^5 clarity^6 clarity^7 color.C color.L color.Q color^4 color^5 color^6 cut.C cut.L cut.Q cut^4
1.31 1.31 1.23 1.21 1.17 1.06 1.04 2.08 2.73 1.77 1.24 2.48 1.91 2.27 1.94 1.48 1.28 1.12

Now this model is the best

# Attach fitted values from the carat + color + cut + clarity model
df.with.predfin.from.model <- add_predictions(diamonds, modelcaratccc)
head(df.with.predfin.from.model)
loadPkg("ggplot2")
# Actual price vs prediction for the full main-effects model
ggplot(df.with.predfin.from.model, aes(price, pred)) +
  geom_point(aes(price, pred), alpha = 0.2) +
  geom_line(aes(pred), colour = "red", size = 1) +
  labs(title = "Carat + Color + Cut + Clarity Linear Model",
       x = "Price", y = "Prediction") +
  xlim(c(0, 20000))

almost looks like a polynomial curve?

interaction terms

# carat * clarity: main effects of both plus their interaction terms
modelcaratccci <- lm(price ~ carat * clarity, data = diamonds)
tab_model(modelcaratccci)
  price
Predictors Estimates CI p
(Intercept) -2703.99 -2736.96 – -2671.03 <0.001
carat 8746.14 8709.34 – 8782.95 <0.001
clarity [linear] -531.37 -658.86 – -403.88 <0.001
clarity [quadratic] 470.47 345.59 – 595.36 <0.001
clarity [cubic] -926.21 -1032.99 – -819.43 <0.001
clarity [4th degree] 693.74 609.23 – 778.25 <0.001
clarity [5th degree] -514.17 -581.92 – -446.41 <0.001
clarity [6th degree] 166.53 108.70 – 224.36 <0.001
clarity [7th degree] -24.58 -74.87 – 25.72 0.338
carat * clarity [linear] 5496.25 5361.14 – 5631.37 <0.001
carat * clarity
[quadratic]
-1037.76 -1165.70 – -909.82 <0.001
carat * clarity [cubic] 1469.89 1354.08 – 1585.71 <0.001
carat * clarity [4th
degree]
-943.92 -1045.34 – -842.51 <0.001
carat * clarity [5th
degree]
674.63 586.02 – 763.23 <0.001
carat * clarity [6th
degree]
-30.09 -106.66 – 46.48 0.441
carat * clarity [7th
degree]
304.46 242.54 – 366.39 <0.001
Observations 53940
R2 / R2 adjusted 0.909 / 0.909
xkablevif(modelcaratccci)
VIFs of the model
carat carat:clarity.C carat:clarity.L carat:clarity.Q carat:clarity^4 carat:clarity^5 carat:clarity^6 carat:clarity^7 clarity.C clarity.L clarity.Q clarity^4 clarity^5 clarity^6 clarity^7
2.94 10.1 8.59 10.2 8.52 6.28 5.22 4.44 10.2 8.21 10 11.6 9.9 7.65 5.19

pretty good model

# (color+cut+clarity)^2 expands to all three main effects plus every
# pairwise interaction among the three factors, with carat as a main effect.
modelcaratccci2 <- lm(price ~ carat + (color+cut+clarity)^2, data = diamonds)
tab_model(modelcaratccci2)
  price
Predictors Estimates CI p
(Intercept) -3638.97 -3676.26 – -3601.68 <0.001
carat 8881.10 8857.62 – 8904.58 <0.001
color [linear] 2336.13 2268.01 – 2404.24 <0.001
color [quadratic] -388.93 -451.66 – -326.21 <0.001
color [cubic] 178.61 121.20 – 236.02 <0.001
color [4th degree] 52.29 0.17 – 104.40 0.049
color [5th degree] 157.27 110.62 – 203.91 <0.001
color [6th degree] -25.23 -66.07 – 15.62 0.226
cut [linear] 746.14 659.74 – 832.54 <0.001
cut [quadratic] -345.13 -421.05 – -269.21 <0.001
cut [cubic] 212.89 153.99 – 271.79 <0.001
cut [4th degree] -1.07 -46.51 – 44.36 0.963
clarity [linear] 4479.14 4360.03 – 4598.25 <0.001
clarity [quadratic] -1635.44 -1745.23 – -1525.66 <0.001
clarity [cubic] 1013.53 914.76 – 1112.29 <0.001
clarity [4th degree] -369.33 -457.75 – -280.91 <0.001
clarity [5th degree] 173.53 98.12 – 248.95 <0.001
clarity [6th degree] -6.35 -66.48 – 53.78 0.836
clarity [7th degree] 92.66 47.40 – 137.92 <0.001
color [linear] * cut
[linear]
-156.52 -285.46 – -27.59 0.017
color [quadratic] * cut
[linear]
-6.35 -126.94 – 114.23 0.918
color [cubic] * cut
[linear]
191.17 76.29 – 306.04 0.001
color [4th degree] * cut
[linear]
115.55 5.20 – 225.90 0.040
color [5th degree] * cut
[linear]
4.62 -96.38 – 105.62 0.929
color [6th degree] * cut
[linear]
-44.86 -137.62 – 47.90 0.343
color [linear] * cut
[quadratic]
17.55 -96.35 – 131.44 0.763
color [quadratic] * cut
[quadratic]
-10.65 -117.32 – 96.03 0.845
color [cubic] * cut
[quadratic]
-117.92 -219.52 – -16.32 0.023
color [4th degree] * cut
[quadratic]
-12.06 -109.33 – 85.21 0.808
color [5th degree] * cut
[quadratic]
100.25 10.85 – 189.65 0.028
color [6th degree] * cut
[quadratic]
-3.23 -85.19 – 78.72 0.938
color [linear] * cut
[cubic]
-315.71 -413.50 – -217.92 <0.001
color [quadratic] * cut
[cubic]
46.00 -46.63 – 138.63 0.330
color [cubic] * cut
[cubic]
127.27 39.38 – 215.17 0.005
color [4th degree] * cut
[cubic]
33.14 -50.13 – 116.42 0.435
color [5th degree] * cut
[cubic]
-139.38 -217.23 – -61.53 <0.001
color [6th degree] * cut
[cubic]
70.04 -2.21 – 142.29 0.057
color [linear] * cut [4th
degree]
-132.75 -211.30 – -54.20 0.001
color [quadratic] * cut
[4th degree]
14.70 -60.42 – 89.82 0.701
color [cubic] * cut [4th
degree]
-68.91 -139.77 – 1.95 0.057
color [4th degree] * cut
[4th degree]
25.37 -40.91 – 91.66 0.453
color [5th degree] * cut
[4th degree]
82.61 19.92 – 145.31 0.010
color [6th degree] * cut
[4th degree]
9.26 -49.18 – 67.70 0.756
color [linear] * clarity
[linear]
380.39 147.05 – 613.72 0.001
color [quadratic] *
clarity [linear]
1324.92 1107.69 – 1542.15 <0.001
color [cubic] * clarity
[linear]
428.25 233.14 – 623.36 <0.001
color [4th degree] *
clarity [linear]
515.76 342.37 – 689.14 <0.001
color [5th degree] *
clarity [linear]
-100.04 -251.31 – 51.23 0.195
color [6th degree] *
clarity [linear]
142.79 11.31 – 274.27 0.033
color [linear] * clarity
[quadratic]
1718.88 1495.84 – 1941.92 <0.001
color [quadratic] *
clarity [quadratic]
1135.83 928.47 – 1343.18 <0.001
color [cubic] * clarity
[quadratic]
350.58 163.45 – 537.70 <0.001
color [4th degree] *
clarity [quadratic]
168.49 1.28 – 335.69 0.048
color [5th degree] *
clarity [quadratic]
351.43 205.15 – 497.71 <0.001
color [6th degree] *
clarity [quadratic]
59.64 -66.93 – 186.21 0.356
color [linear] * clarity
[cubic]
88.87 -105.48 – 283.22 0.370
color [quadratic] *
clarity [cubic]
1046.58 865.45 – 1227.72 <0.001
color [cubic] * clarity
[cubic]
515.80 352.99 – 678.62 <0.001
color [4th degree] *
clarity [cubic]
242.40 98.06 – 386.74 0.001
color [5th degree] *
clarity [cubic]
-200.08 -326.45 – -73.70 0.002
color [6th degree] *
clarity [cubic]
100.17 -9.02 – 209.37 0.072
color [linear] * clarity
[4th degree]
777.96 620.19 – 935.73 <0.001
color [quadratic] *
clarity [4th degree]
206.65 58.48 – 354.81 0.006
color [cubic] * clarity
[4th degree]
234.71 103.03 – 366.39 <0.001
color [4th degree] *
clarity [4th degree]
100.72 -13.75 – 215.19 0.085
color [5th degree] *
clarity [4th degree]
265.99 165.36 – 366.62 <0.001
color [6th degree] *
clarity [4th degree]
-5.50 -92.89 – 81.90 0.902
color [linear] * clarity
[5th degree]
143.16 13.26 – 273.06 0.031
color [quadratic] *
clarity [5th degree]
403.98 281.33 – 526.63 <0.001
color [cubic] * clarity
[5th degree]
158.89 50.22 – 267.57 0.004
color [4th degree] *
clarity [5th degree]
153.65 60.42 – 246.88 0.001
color [5th degree] *
clarity [5th degree]
-85.84 -168.85 – -2.84 0.043
color [6th degree] *
clarity [5th degree]
-16.20 -88.41 – 56.01 0.660
color [linear] * clarity
[6th degree]
299.99 189.98 – 410.00 <0.001
color [quadratic] *
clarity [6th degree]
111.82 7.88 – 215.77 0.035
color [cubic] * clarity
[6th degree]
274.00 180.41 – 367.59 <0.001
color [4th degree] *
clarity [6th degree]
-58.39 -139.50 – 22.72 0.158
color [5th degree] *
clarity [6th degree]
2.08 -71.90 – 76.06 0.956
color [6th degree] *
clarity [6th degree]
17.90 -46.48 – 82.27 0.586
color [linear] * clarity
[7th degree]
309.75 223.47 – 396.04 <0.001
color [quadratic] *
clarity [7th degree]
-57.62 -139.29 – 24.04 0.167
color [cubic] * clarity
[7th degree]
24.45 -51.53 – 100.43 0.528
color [4th degree] *
clarity [7th degree]
40.22 -27.54 – 107.98 0.245
color [5th degree] *
clarity [7th degree]
75.03 10.60 – 139.47 0.022
color [6th degree] *
clarity [7th degree]
32.36 -24.66 – 89.38 0.266
cut [linear] * clarity
[linear]
96.57 -221.04 – 414.17 0.551
cut [quadratic] * clarity
[linear]
-683.67 -964.58 – -402.77 <0.001
cut [cubic] * clarity
[linear]
-241.08 -456.94 – -25.22 0.029
cut [4th degree] *
clarity [linear]
-124.53 -295.30 – 46.23 0.153
cut [linear] * clarity
[quadratic]
424.52 132.64 – 716.40 0.004
cut [quadratic] * clarity
[quadratic]
-381.73 -641.46 – -122.00 0.004
cut [cubic] * clarity
[quadratic]
406.38 202.99 – 609.77 <0.001
cut [4th degree] *
clarity [quadratic]
-27.27 -192.97 – 138.42 0.747
cut [linear] * clarity
[cubic]
434.99 169.75 – 700.22 0.001
cut [quadratic] * clarity
[cubic]
-591.53 -825.89 – -357.17 <0.001
cut [cubic] * clarity
[cubic]
253.30 71.68 – 434.93 0.006
cut [4th degree] *
clarity [cubic]
-24.85 -167.49 – 117.79 0.733
cut [linear] * clarity
[4th degree]
349.97 104.55 – 595.39 0.005
cut [quadratic] * clarity
[4th degree]
-269.02 -482.67 – -55.36 0.014
cut [cubic] * clarity
[4th degree]
191.84 32.48 – 351.19 0.018
cut [4th degree] *
clarity [4th degree]
-116.54 -231.37 – -1.72 0.047
cut [linear] * clarity
[5th degree]
351.69 140.95 – 562.44 0.001
cut [quadratic] * clarity
[5th degree]
-367.28 -549.40 – -185.16 <0.001
cut [cubic] * clarity
[5th degree]
198.17 62.73 – 333.62 0.004
cut [4th degree] *
clarity [5th degree]
-32.54 -125.13 – 60.05 0.491
cut [linear] * clarity
[6th degree]
55.85 -108.32 – 220.01 0.505
cut [quadratic] * clarity
[6th degree]
-101.12 -243.18 – 40.93 0.163
cut [cubic] * clarity
[6th degree]
124.59 13.87 – 235.31 0.027
cut [4th degree] *
clarity [6th degree]
-9.78 -87.16 – 67.61 0.804
cut [linear] * clarity
[7th degree]
16.62 -104.82 – 138.05 0.789
cut [quadratic] * clarity
[7th degree]
25.90 -80.17 – 131.98 0.632
cut [cubic] * clarity
[7th degree]
-36.93 -123.74 – 49.87 0.404
cut [4th degree] *
clarity [7th degree]
66.55 2.05 – 131.06 0.043
Observations 53940
R2 / R2 adjusted 0.918 / 0.918
xkablevif(modelcaratccci2)
VIFs of the model
carat clarity.C clarity.L clarity.Q clarity^4 clarity^5 clarity^6 clarity^7 color.C color.C:clarity.C color.C:clarity.L color.C:clarity.Q color.C:clarity^4 color.C:clarity^5 color.C:clarity^6 color.C:clarity^7 color.C:cut.C color.C:cut.L color.C:cut.Q color.C:cut^4 color.L color.L:clarity.C color.L:clarity.L color.L:clarity.Q color.L:clarity^4 color.L:clarity^5 color.L:clarity^6 color.L:clarity^7 color.L:cut.C color.L:cut.L color.L:cut.Q color.L:cut^4 color.Q color.Q:clarity.C color.Q:clarity.L color.Q:clarity.Q color.Q:clarity^4 color.Q:clarity^5 color.Q:clarity^6 color.Q:clarity^7 color.Q:cut.C color.Q:cut.L color.Q:cut.Q color.Q:cut^4 color^4 color^4:clarity.C color^4:clarity.L color^4:clarity.Q color4:clarity4 color4:clarity5 color4:clarity6 color4:clarity7 color^4:cut.C color^4:cut.L color^4:cut.Q color4:cut4 color^5 color^5:clarity.C color^5:clarity.L color^5:clarity.Q color5:clarity4 color5:clarity5 color5:clarity6 color5:clarity7 color^5:cut.C color^5:cut.L color^5:cut.Q color5:cut4 color^6 color^6:clarity.C color^6:clarity.L color^6:clarity.Q color6:clarity4 color6:clarity5 color6:clarity6 color6:clarity7 color^6:cut.C color^6:cut.L color^6:cut.Q color6:cut4 cut.C cut.C:clarity.C cut.C:clarity.L cut.C:clarity.Q cut.C:clarity^4 cut.C:clarity^5 cut.C:clarity^6 cut.C:clarity^7 cut.L cut.L:clarity.C cut.L:clarity.L cut.L:clarity.Q cut.L:clarity^4 cut.L:clarity^5 cut.L:clarity^6 cut.L:clarity^7 cut.Q cut.Q:clarity.C cut.Q:clarity.L cut.Q:clarity.Q cut.Q:clarity^4 cut.Q:clarity^5 cut.Q:clarity^6 cut.Q:clarity^7 cut^4 cut^4:clarity.C cut^4:clarity.L cut^4:clarity.Q cut4:clarity4 cut4:clarity5 cut4:clarity6 cut4:clarity7
1.34 5.18 5 4.68 4.43 3.62 3.27 10 13.1 6.79 4.42 9.89 7.42 9.76 10.4 8.7 6.31 4.02 3.87 4.02 3.82 4.12 3.46 3.56 3.22 3.32 3.1 3.28 2.81 2.8 2.04 2.17 2 2.04 1.84 1.87 1.49 1.55 1.46 1.44 1.33 1.31 4.13 4.5 3.69 3.33 2.69 2.48 4.91 5.27 4.34 4.12 3.14 2.94 4.69 4.84 4.03 3.58 2.86 2.5 3.57 3.73 3.05 2.65 2.1 1.88 2.85 2.95 2.41 2.01 1.65 1.44 2.38 2.42 2.05 1.73 1.48 1.28 1.62 1.69 1.49 1.39 1.23 1.15 16.1 12.4 6.23 4.22 15.3 14 7.08 4.99 16.2 13.3 6.82 4.74 16.5 12.7 6.28 3.52 13.7 10.7 5.19 2.69 9.43 7.42 3.88 2.17 6.02 4.68 2.67 1.62

high VIF values

# NOTE(review): (carat + clarity)^2 expands to carat + clarity +
# carat:clarity — exactly the same model as modelcaratccci above, which is
# why the coefficient table and VIFs below are identical to that model's.
modelcaratccci3 <- lm(price ~ (carat + clarity)^2, data = diamonds)
tab_model(modelcaratccci3)
  price
Predictors Estimates CI p
(Intercept) -2703.99 -2736.96 – -2671.03 <0.001
carat 8746.14 8709.34 – 8782.95 <0.001
clarity [linear] -531.37 -658.86 – -403.88 <0.001
clarity [quadratic] 470.47 345.59 – 595.36 <0.001
clarity [cubic] -926.21 -1032.99 – -819.43 <0.001
clarity [4th degree] 693.74 609.23 – 778.25 <0.001
clarity [5th degree] -514.17 -581.92 – -446.41 <0.001
clarity [6th degree] 166.53 108.70 – 224.36 <0.001
clarity [7th degree] -24.58 -74.87 – 25.72 0.338
carat * clarity [linear] 5496.25 5361.14 – 5631.37 <0.001
carat * clarity
[quadratic]
-1037.76 -1165.70 – -909.82 <0.001
carat * clarity [cubic] 1469.89 1354.08 – 1585.71 <0.001
carat * clarity [4th
degree]
-943.92 -1045.34 – -842.51 <0.001
carat * clarity [5th
degree]
674.63 586.02 – 763.23 <0.001
carat * clarity [6th
degree]
-30.09 -106.66 – 46.48 0.441
carat * clarity [7th
degree]
304.46 242.54 – 366.39 <0.001
Observations 53940
R2 / R2 adjusted 0.909 / 0.909
xkablevif(modelcaratccci3)
VIFs of the model
carat carat:clarity.C carat:clarity.L carat:clarity.Q carat:clarity^4 carat:clarity^5 carat:clarity^6 carat:clarity^7 clarity.C clarity.L clarity.Q clarity^4 clarity^5 clarity^6 clarity^7
2.94 10.1 8.59 10.2 8.52 6.28 5.22 4.44 10.2 8.21 10 11.6 9.9 7.65 5.19

pretty great again, but high VIF

Nothing with interaction terms beats the carat + ccc model

feature selection

loadPkg("leaps")
# Exhaustive best-subset search over all predictors, up to 10 variables,
# keeping the single best model of each size.
reg.best10 <- regsubsets(price~. , data = diamonds, nvmax = 10, nbest = 1, method = "exhaustive")  # leaps::regsubsets() - Model selection by exhaustive (default) search, forward or backward stepwise, or sequential replacement
# Shaded grid: which variables enter the best model at each adjusted-R^2 level
plot(reg.best10, scale = "adjr2", main = "Exhaustive Adjusted R^2")

all types of models look the same, so just stick to adjusted R squared

# Forward stepwise selection, same settings as the exhaustive search
reg.forward10 <- regsubsets(price~., data = diamonds, nvmax = 10, nbest = 1, method = "forward")
plot(reg.forward10, scale = "adjr2", main = "Forward Adjusted R^2")

# summary(reg.forward10)

again all the same

# Backward stepwise selection. The original reused the name reg.forward10
# for this object; renamed so the variable matches the method it holds.
reg.backward10 <- regsubsets(price~., data = diamonds, nvmax = 10, nbest = 1, method = "backward")
plot(reg.backward10, scale = "adjr2", main = "Backward Adjusted R^2")

# summary(reg.forward10)

same

# Sequential-replacement selection. Also renamed from the misleading
# reg.forward10 used in the original.
reg.seqrep10 <- regsubsets(price~., data = diamonds, nvmax = 10, nbest = 1, method = "seqrep")
plot(reg.seqrep10, scale = "adjr2", main = "Sequential Replacement Adjusted R^2")

# summary(reg.forward10)

same

All feature-selection methods reflect our findings and give essentially the same results.

PCR modeling to predict price

We first subset the continuous variables from the dataframe to a new dataframe.

# Keep only the continuous variables (response included) for PCR
diamondscont <- subset(diamonds,
                       select = c("carat", "depth", "table", "price", "x", "y", "z"))

This data is not scaled and hence we have to standardize the data.

# Standardize every column (scale() returns a matrix, so coerce back to a
# data frame for the modeling functions below).
diamondscont <- as.data.frame(scale(diamondscont))
head(diamondscont)
##   carat  depth  table  price     x     y     z
## 1 -1.20 -0.174 -1.100 -0.904 -1.59 -1.54 -1.57
## 2 -1.24 -1.361  1.586 -0.904 -1.64 -1.66 -1.74
## 3 -1.20 -3.385  3.376 -0.904 -1.50 -1.46 -1.74
## 4 -1.07  0.454  0.243 -0.902 -1.36 -1.32 -1.29
## 5 -1.03  1.082  0.243 -0.902 -1.24 -1.21 -1.12
## 6 -1.18  0.733 -0.205 -0.902 -1.60 -1.55 -1.50

Running the PCR model using pls library :

library(pls)
# Principal-component regression on all 6 standardized predictors, with
# 10-fold cross-validation of the prediction error (RMSEP table below).
pcr_model <- pcr(price~., data = diamondscont,ncomp=6, validation = "CV")
summary(pcr_model)
## Data:    X dimension: 53940 6 
##  Y dimension: 53940 1
## Fit method: svdpc
## Number of components considered: 6
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV               1    0.452   0.4497   0.4418   0.3884   0.3861   0.3761
## adjCV            1    0.452   0.4497   0.4418   0.3831   0.3865   0.3760
## 
## TRAINING: % variance explained
##        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## X        65.54    86.94    98.33    99.12    99.78   100.00
## price    79.57    79.77    80.49    85.01    85.21    85.92
validationplot(pcr_model)

We can see from the results of the PCR model that with just one component we can explain around 79 % of variation in price and as the components increase, the variation in price explained increases very marginally so it would make sense to build a model with just one variable.

Linear regression model with 1 component

# Standardized price regressed on x alone
linear_model_pcr_1a <- lm(price ~ x, data = diamondscont)
summary(linear_model_pcr_1a)
## 
## Call:
## lm(formula = price ~ x, data = diamondscont)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.112 -0.317 -0.046  0.244  8.053 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.24e-14   2.01e-03       0        1    
## x           8.84e-01   2.01e-03     440   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.467 on 53938 degrees of freedom
## Multiple R-squared:  0.782,  Adjusted R-squared:  0.782 
## F-statistic: 1.94e+05 on 1 and 53938 DF,  p-value: <2e-16
# Standardized price regressed on y alone
linear_model_pcr_1b <- lm(price ~ y, data = diamondscont)
summary(linear_model_pcr_1b)
## 
## Call:
## lm(formula = price ~ y, data = diamondscont)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.21  -0.31  -0.06   0.21   7.88 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.52e-15   2.16e-03       0        1    
## y           8.65e-01   2.16e-03     401   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.501 on 53938 degrees of freedom
## Multiple R-squared:  0.749,  Adjusted R-squared:  0.749 
## F-statistic: 1.61e+05 on 1 and 53938 DF,  p-value: <2e-16
# Standardized price regressed on z alone
linear_model_pcr_1c <- lm(price ~ z, data = diamondscont)
summary(linear_model_pcr_1c)
## 
## Call:
## lm(formula = price ~ z, data = diamondscont)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34.98  -0.31  -0.06   0.21   8.04 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.56e-15   2.19e-03       0        1    
## z           8.61e-01   2.19e-03     394   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.508 on 53938 degrees of freedom
## Multiple R-squared:  0.742,  Adjusted R-squared:  0.742 
## F-statistic: 1.55e+05 on 1 and 53938 DF,  p-value: <2e-16
# Standardized price regressed on depth alone
linear_model_pcr_1d <- lm(price ~ depth, data = diamondscont)
summary(linear_model_pcr_1d)
## 
## Call:
## lm(formula = price ~ depth, data = diamondscont)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.944 -0.748 -0.381  0.350  3.744 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  9.73e-15   4.31e-03    0.00    1.000  
## depth       -1.06e-02   4.31e-03   -2.47    0.013 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1 on 53938 degrees of freedom
## Multiple R-squared:  0.000113,   Adjusted R-squared:  9.48e-05 
## F-statistic: 6.12 on 1 and 53938 DF,  p-value: 0.0134
# Standardized price regressed on table alone
linear_model_pcr_1e <- lm(price ~ table, data = diamondscont)
summary(linear_model_pcr_1e)
## 
## Call:
## lm(formula = price ~ table, data = diamondscont)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.635 -0.690 -0.374  0.343  3.947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.04e-14   4.27e-03     0.0        1    
## table       1.27e-01   4.27e-03    29.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.992 on 53938 degrees of freedom
## Multiple R-squared:  0.0162, Adjusted R-squared:  0.0161 
## F-statistic:  886 on 1 and 53938 DF,  p-value: <2e-16
# Standardized price regressed on carat alone
linear_model_pcr_1f <- lm(price ~ carat, data = diamondscont)
summary(linear_model_pcr_1f)
## 
## Call:
## lm(formula = price ~ carat, data = diamondscont)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.659 -0.202 -0.005  0.135  3.191 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.36e-14   1.67e-03       0        1    
## carat       9.22e-01   1.67e-03     551   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.388 on 53938 degrees of freedom
## Multiple R-squared:  0.849,  Adjusted R-squared:  0.849 
## F-statistic: 3.04e+05 on 1 and 53938 DF,  p-value: <2e-16

We can see from the above models that if we have to choose one variable to best explain the variation in price, it has to be the carat of the diamond because it has the highest adjusted r squared value among the other variables.

If we try to build a model with more than one variable as above, we can see that the adjusted r squared value of the model increases very slightly, which means that the variation in price is explained only a little better than the model with just one variable. If we see the vif values which checks the collinearity of the variable, we can see that all the continuous variables are highly collinear.

Random Forest Classification to predict cut

In this section we will be using the Random Forest model to build a classification model to be able to effectively classify diamonds into ‘Fair’, ‘Good’, ‘Very Good’, ‘Premium’, ‘Ideal’ diamond cuts, using the base features.

Prior to Random Forest Classification, Multi-nomial Logistic regression had been used but a very low accuracy score was found.

We start by removing missing values and outliers to make a good classification.

# Work on a copy so the original data frame stays untouched
mydiamonds <- diamonds
# Recode "?" placeholders as proper missing values
mydiamonds[mydiamonds == "?"] <- NA
# Count missing values per column
colSums(is.na(mydiamonds))
# str(mydiamonds)
# boxplot(mydiamonds$price)
# any(is.na.data.frame(mydiamonds_clean))
# Drop price outliers flagged by the boxplot rule (beyond 1.5 * IQR whiskers)
price_outliers <- boxplot.stats(mydiamonds$price)$out
mydiamonds_clean <- subset(mydiamonds, !(mydiamonds$price %in% price_outliers))

Upon removing outliers, 3538 outliers were detected and removed.

Next, we split our data set into training and testing set with 80% and 20% respectively.

# Load modeling libraries. library() errors immediately when a package is
# missing, whereas require() only returns FALSE with a warning and would let
# the script continue to a confusing downstream failure.
library(randomForest)
library(caTools)
library(MASS)
library(caret)
# Fix the RNG seed so the train/test split is reproducible
set.seed(1)
# Drop the ID column (column 1); keep the ten modeling columns
dataset <- mydiamonds_clean[, 2:11]
# Stratified 20% hold-out on the outcome `cut`; remaining 80% is training
test <- createDataPartition(dataset$cut, p = .2, list = FALSE)
data_train <- dataset[-test, ]
data_test <- dataset[test, ]

Multinomial Logistic Regression

Before we dive into Random Forest classification, let's perform a multinomial logistic regression and take a look at its accuracy score.

require(nnet)
# Train a multinomial logistic regression of cut on all other features.
# NOTE(review): the model is fit on data_test, not data_train — it is then
# evaluated on the same 20% partition it was trained on. Confirm whether
# data = data_train was intended; the reported 65.01% accuracy depends on this.
multinom_model <- multinom(cut ~ ., data = data_test)

# Inspect coefficients and fit statistics
summary(multinom_model)

# Predict class labels on the test partition (the same data used for fitting)
classPredicted <- predict(multinom_model, newdata = data_test, "class")

# Confusion matrix of observed vs predicted cut
library(caret)
cmlogit=confusionMatrix(data_test$cut,classPredicted)

Logistic Regression has an accuracy score of 65.01%.

Random Forest Classification

With logistic regression performing poorly, we will test Random Forest classification.

First, we tune the Random Forest Model to get the best mtry (number of variables sampled at each split).

# Tuning Random Forest to get the best mtry (number of variables random sampled as candidates at each split)
# Column 2 of data_train is the response (cut); all remaining columns are
# predictors. tuneRF steps mtry by a factor of 0.5 in each direction from the
# default, growing 75 trees per candidate, and keeps searching while the OOB
# error improves by at least 5% (improve = 0.05).
tune= tuneRF(data_train[,-2],data_train[,2], stepFactor = 0.5, plot=TRUE,
             ntreeTry=75,trace=TRUE, improve=0.05)
## mtry = 3  OOB error = 21.46% 
## Searching left ...
## mtry = 6     OOB error = 21.34% 
## 0.00555 0.05 
## Searching right ...
## mtry = 1     OOB error = 25.41% 
## -0.184 0.05

Note: mtry is the number of predictors sampled for splitting at each node.

The chart above helps us choose the best mtry to get a minimal Out of Bag Error. Obviously, 6 is the best mtry with less than 22% OOB error.

# Fit the random forest classifier using the tuned mtry (6) and 75 trees
rf <- randomForest(cut ~ ., data = data_train, mtry = 6, ntree = 75)
# Score the held-out test partition (column 2 is the response, cut)
pred <- predict(rf, newdata = data_test[, -2])
# Cross-tabulate observed vs predicted cut
cm <- confusionMatrix(data_test$cut, pred)
print(rf)
## 
## Call:
##  randomForest(formula = cut ~ ., data = data_train, mtry = 6,      ntree = 75) 
##                Type of random forest: classification
##                      Number of trees: 75
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 21.4%
## Confusion matrix:
##           Fair Good Very Good Premium Ideal class.error
## Fair      1055   93        21      29     9       0.126
## Good       104 2637       796     125    50       0.290
## Very Good   10  575      5199    1347  1929       0.426
## Premium      0   43       954    7936  1093       0.208
## Ideal        5   31       816     584 14879       0.088

The random forest classifier has been modeled with mtry as 6 and ntree as 75.

xkabledply(cm$table,title="Confusion Matrix")
Confusion Matrix
Fair Good Very Good Premium Ideal
Fair 272 18 2 8 2
Good 34 648 204 24 19
Very Good 3 138 1330 322 472
Premium 0 8 225 1978 296
Ideal 2 9 194 136 3738
cm$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##       7.90e-01       7.02e-01       7.82e-01       7.98e-01       4.49e-01 
## AccuracyPValue  McnemarPValue 
##       0.00e+00       8.87e-44

The confusion matrix is seen above. Also, we get an accuracy score of 79.012% for the test dataset.

plot(rf)

The graph above shows that the out-of-bag error decreases as the number of trees increases; however, growing more than 75 trees does not reduce the out-of-bag error significantly.

round(importance(rf), 2)
##         MeanDecreaseGini
## carat               1085
## color               1082
## clarity              986
## depth               6933
## table               7925
## price               3057
## x                   3152
## y                   3039
## z                   1541
# Mean Decrease in Gini is the average (mean) of a variable's total decrease in node impurity
varImpPlot(rf)

The mean decrease in Gini coefficient is a measure of how each variable contributes to the homogeneity of the nodes and leaves in the resulting random forest. The higher the value of mean decrease Gini score, the higher the importance of the variable in the model.

In the above chart, table and depth have the highest importance in the Random Forest model.

# Rebuild the split without clarity (column 5), the feature with the lowest
# mean decrease in Gini, to see whether dropping it hurts the model
set.seed(1)
dataset2 <- mydiamonds_clean[, c(2, 3, 4, 6, 7, 8, 9, 10, 11)]
test2 <- createDataPartition(dataset2$cut, p = .2, list = FALSE)
data_train2 <- dataset2[-test2, ]
data_test2 <- dataset2[test2, ]

# Refit the forest with the same hyperparameters and evaluate on the new split
rf2 <- randomForest(cut ~ ., data = data_train2, mtry = 6, ntree = 75)
pred2 <- predict(rf2, newdata = data_test2[, -2])
cm2 <- confusionMatrix(data_test2$cut, pred2)
print(rf2)

Removing least mean decrease gini feature (clarity) increases the Out of Bag error by 0.1%. We will stick to the first random forest model for predicting diamond cut type with minimal out of bag error.

# Compare logistic regression vs random forest on overall accuracy and on
# macro-averaged (unweighted mean over the 5 cut classes) sensitivity,
# specificity and precision, all expressed as percentages.
# cm*$byClass is a 5-row matrix (one row per class); columns 1, 2 and 5 hold
# sensitivity, specificity and precision, so linear indices 1:5, 6:10 and
# 21:25 select those columns class by class.
check <- data.frame()
check <- rbind(check, round(sum(diag(cmlogit$table)) / sum(cmlogit$table) * 100, 2))
check <- rbind(check, round(sum(diag(cm$table)) / sum(cm$table) * 100, 2))
rownames(check) <- c("logistic", "random forest")
# Fixed typo: was "AverageSensitiviy"
colnames(check) <- c("Accuracy Score")
check$AverageSensitivity <- c(mean(cmlogit$byClass[1:5]) * 100, mean(cm$byClass[1:5]) * 100)
check$AverageSpecificity <- c(mean(cmlogit$byClass[6:10]) * 100, mean(cm$byClass[6:10]) * 100)
check$AveragePrecision <- c(mean(cmlogit$byClass[21:25]) * 100, mean(cm$byClass[21:25]) * 100)

# Grouped bars: one (logistic, random forest) pair per metric.
# Fixed typo in the legend: was "RandoForest".
barplot(height = as.matrix(check), beside = TRUE,
        names.arg = c("Accuracy", "Sensitivity", "Specificity", "Precision"),
        legend.text = c("Logistic", "Random Forest"),
        col = c("blue", "green"))

The barplot above gives us a pictorial view of how Random Forest surpasses Logistic regression for classifying diamond cut type.

KNN modeling to predict cut

pick numeric columns only

# Keep carat, cut, depth, table, price, x, y, z, then standardize everything
# except cut (position 2), leaving only the scaled numeric features
knndata <- diamonds[, c(2, 3, 6, 7, 8, 9, 10, 11)]
scaled_features <- scale(knndata[, -2], center = TRUE, scale = TRUE)
knndata <- as.data.frame(scaled_features)

split data set

set.seed(1000)
# Randomly assign each row to group 1 (training, ~75%) or group 2 (test, ~25%)
knn_sample <- sample(2, nrow(knndata), replace = TRUE, prob = c(0.75, 0.25))
in_train <- knn_sample == 1
knn_training <- knndata[in_train, ]
knn_test <- knndata[!in_train, ]
# Column 3 of the original data frame holds the class label, cut
trainLabels <- diamonds[in_train, 3]
testLabels <- diamonds[!in_train, 3]

build KNN model with K = 7

# loadPkg() is a project helper defined earlier in the file; presumably it
# installs the package if needed and attaches it — TODO confirm its definition.
loadPkg("FNN")
loadPkg("gmodels")
loadPkg("caret")
# k-nearest-neighbour classification with k = 7: each test row is labelled by
# majority vote of its 7 closest training rows in the standardized feature space
knn_pred <- knn(train = knn_training, test = knn_test, cl=trainLabels, k=7)
# Cross-tabulate actual vs predicted cut (chi-square contributions suppressed)
KNNCross <- CrossTable(testLabels, knn_pred, prop.chisq = FALSE)

check different k values

# Sweep k from 3 to 15 and record the test-set accuracy for each value
ResultDf <- data.frame(k = numeric(0), Total.Accuracy = numeric(0), row.names = NULL)
for (kval in 3:15) {
  # Refit kNN at this k and tabulate actual vs predicted labels
  knn_pred <- knn(train = knn_training, test = knn_test, cl = trainLabels, k = kval)
  KNNCross <- CrossTable(testLabels, knn_pred, prop.chisq = FALSE)
  print(paste("k = ", kval))
  KNNCross
  # Confusion matrix and overall accuracy for this k
  cm <- confusionMatrix(knn_pred, reference = testLabels)
  cmaccu <- cm$overall['Accuracy']
  print(paste("Total Accuracy = ", cmaccu))
  # Append this k's result to the summary table
  cmt <- data.frame(k = kval, Total.Accuracy = cmaccu, row.names = NULL)
  ResultDf <- rbind(ResultDf, cmt)
  print(xkabledply(as.matrix(cm), title = paste("ConfusionMatrix for k = ", kval)))
  print(xkabledply(data.frame(cm$byClass), title = paste("k = ", kval)))
}

show different accuracy values

xkabledply(ResultDf, "Total Accuracy Summary")

draw “accuracy vs k” plot

loadPkg("ggplot2")
# Line-and-point plot of test-set accuracy as a function of k
accuracy_plot <- ggplot(ResultDf, aes(x = k, y = Total.Accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) +
  labs(title = "accuracy vs k")
accuracy_plot